In [ ]:
First you have to import the sqlite3 module and establish a connection to a database named 'movie.db'. As this databases does not exist up to now it gets created.
In [1]:
import sqlite3
conn = sqlite3.connect('movie1.db')
Now to create the needed tables we need to execute our first SQL statements. It is important to call the commit() function if you want your changes to appear in the file. It is recommended to not set the AUTO INCREMENT parameter for the primary keys. In this case the ID just refers to the ROWID column of the table which is automatically created. The AUTO INCREMENT parameter would asure you that for every new entry a higher key is created and smaller keys from deleted entries are not used. If you set the PRIMARY KEY make sure that the data type is INTEGER and not INT otherwise the field won't be filled automatically.
In [2]:
conn.execute('''CREATE TABLE FILM
(ID INTEGER PRIMARY KEY,
TITLE CHAR(200) NOT NULL,
YEAR INTEGER NOT NULL,
GENRE CHAR(50) NOT NULL,
UNIQUE(TITLE, YEAR, GENRE));''')
conn.execute('''CREATE TABLE PERSON
(ID INTEGER PRIMARY KEY,
NAME CHAR(50) NOT NULL,
FIRSTNAME CHAR(50) NOT NULL,
P_ID INTEGER UNIQUE NOT NULL);''')
conn.execute('''CREATE TABLE CINEMA
(ID INTEGER PRIMARY KEY,
NAME CHAR(50) NOT NULL,
CITY CHAR(50) NOT NULL,
UNIQUE(NAME, CITY));''')
conn.execute('''CREATE TABLE PARTICIPATION
(FILM INTEGER NOT NULL,
PERSON INTEGER NOT NULL,
FUNCTION CHAR(20) NOT NULL,
FOREIGN KEY(FILM) REFERENCES FILM(ID),
FOREIGN KEY(PERSON) REFERENCES PERSON(ID),
UNIQUE(FILM, PERSON, FUNCTION));''')
conn.execute('''CREATE TABLE SHOW
(FILM INTEGER NOT NULL,
DATE DATETIME NOT NULL,
CINEMA CHAR(30) NOT NULL,
FOREIGN KEY(FILM) REFERENCES FILM(ID),
FOREIGN KEY(CINEMA) REFERENCES CINEMA(ID),
UNIQUE(FILM, DATE, CINEMA));''')
conn.commit()
To fill the created tables with data we first need the data. For the beginning I manually collected the needed data for four movies and severall cinemas in Bonn and Cologne.
In [3]:
import tmdbsimple as tmdb
tmdb.API_KEY = '7f0bbadc274ac0c100f84d8bf81ef2f6'
disc = tmdb.Discover()
total_pages = disc.movie(page = 1, vote_average_gte = 6.5,
vote_count_gte=100, language='en-US')['total_pages']
all_movies = []
for i in range(1,total_pages+1):
movies = disc.movie(page = i, vote_average_gte = 6.5,
vote_count_gte=50, language='en-US')['results']
all_movies += movies
In [4]:
gen = tmdb.Genres()
genres_dict = {x['id'] : x['name'] for x in gen.list()['genres']}
wrong_genre_ids =[]
for movie in all_movies:
genre_list = []
for gen_id in movie['genre_ids']:
try:
genre_list.append(genres_dict[gen_id])
except KeyError:
wrong_genre_ids.append(gen_id)
movie['genre_string'] = '/'.join(genre_list)
In [5]:
for movie in all_movies:
mov_id = movie['id']
actor_list = []
director_list = []
for char in tmdb.Movies(mov_id).credits()['cast']:
actor_list.append((char['name'],char['id']))
for char in tmdb.Movies(mov_id).credits()['crew']:
if char['job'] == 'Director':
director_list.append((char['name'],char['id']))
movie['actors'] = actor_list
movie['directors'] = director_list
In [6]:
# Cinema Data
cinemas = [('Bonner Kinemathek', 'Bonn'),
('Stern Lichtspiele', 'Bonn'),
('WOKI - Dein Kino!', 'Bonn'),
('Neue Filmbühne', 'Bonn'),
('Rex-Lichtspieltheater', 'Bonn'),
('Cinedom', 'Cologne'),
('Residenz', 'Cologne'),
('Metropolis Lichtspieltheater GmbH', 'Cologne'),
('Filmpalette', 'Cologne'),
('ODEON-Lichtspieltheater GmbH', 'Cologne'),
('Off Broadway Kino', 'Cologne'),
('Theater am Weißhaus', 'Cologne'),
('Cinenova', 'Cologne')
]
First I fill all the tables which do not reference any other tables, namely the FILM, PERSON and CINEMA tables. Therefore I loop over the entries in our data to create insertion statements for every entry using pythons string formating.
In [7]:
# Insert Movies
movie_errors = []
for movie in all_movies:
try:
conn.execute('''INSERT INTO FILM (TITLE,YEAR,GENRE)
VALUES ("{}", "{}", "{}");'''.format(movie['original_title'],
movie['release_date'][:4],
movie['genre_string']))
except sqlite3.IntegrityError:
continue
except sqlite3.OperationalError:
movie_errors.append(movie['original_title'])
cur = conn.cursor()
# Insert Persons
person_errors = []
for movie in all_movies:
for actor in movie['actors']:
actor_id = actor[1]
actor_name = actor[0]
try:
conn.execute('''INSERT INTO PERSON (FIRSTNAME,NAME, P_ID)
VALUES ("{}", "{}", "{}");'''.format(' '.join(actor_name.split()[:-1]),
actor_name.split()[-1],
actor_id))
except sqlite3.IntegrityError:
continue
except sqlite3.OperationalError:
person_errors.append(actor_name)
for director in movie['directors']:
director_id = director[1]
director_name = director[0]
try:
conn.execute('''INSERT INTO PERSON (FIRSTNAME,NAME, P_ID)
VALUES ("{}", "{}", "{}");'''.format(' '.join(director_name.split()[:-1]),
director_name.split()[-1],
director_id))
except sqlite3.IntegrityError:
continue
except sqlite3.OperationalError:
person_errors.append(director_name)
# Insert Cinemas
for cinema in cinemas:
conn.execute('''INSERT INTO CINEMA (NAME, CITY)
VALUES ('{}', '{}');'''.format(*cinema))
conn.commit()
The SHOW table is intended to hold the data when you can see which film in which cinema. As it would be rather difficult to find out the dates and times when each of our four movies where shown in the different cinemas I decided to use random dates in the date column of the table SHOW. Therefore I borrowed the following function from StackOverflow.
In [8]:
from random import randrange
from datetime import timedelta
import datetime
def random_date(start, end):
"""
This function will return a random datetime between two datetime
objects.
"""
delta = end - start
int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
random_second = randrange(int_delta)
return start + timedelta(seconds=random_second)
d1 = datetime.datetime(1980, 1,1)
d2 = datetime.datetime(2016, 1,1)
str(random_date(d1, d2))
Out[8]:
In [ ]:
In [9]:
import random
# Insert Participation
for movie in all_movies:
try:
for actor in movie['actors']:
actor_id = actor[1]
actor_name = actor[0]
conn.execute('''INSERT INTO PARTICIPATION (FILM, PERSON, FUNCTION)
VALUES ((SELECT ID from FILM WHERE TITLE="{}"),
(SELECT ID from PERSON WHERE FIRSTNAME="{}"
AND NAME="{}"
AND P_ID="{}"),
'actor');'''.format(movie['original_title'],
' '.join(actor_name.split()[:-1]),
actor_name.split()[-1],
actor_id))
for director in movie['directors']:
director_id = director[1]
director_name = director[0]
conn.execute('''INSERT INTO PARTICIPATION (FILM, PERSON, FUNCTION) VALUES
((SELECT ID from FILM WHERE TITLE="{}"),
(SELECT ID from PERSON WHERE FIRSTNAME="{}"
AND NAME="{}"
AND P_ID="{}"),
'director');'''.format(movie['original_title'],
' '.join(director_name.split()[:-1]),
director_name.split()[-1],
director_id))
except sqlite3.IntegrityError:
continue
except sqlite3.OperationalError:
pass
# Insert Shows with random dates.
for cinema in cinemas:
for movie in all_movies:
#Do not add all movies to all cinemas
if random.uniform(0, 1) > 0.5:
conn.execute('''INSERT INTO SHOW (FILM, DATE, CINEMA) VALUES
((SELECT ID from FILM WHERE TITLE="{}"),
"{}",
(SELECT ID from CINEMA WHERE NAME="{}" AND CITY="{}")
);'''.format(movie['original_title'],
str(random_date(d1, d2)),*cinema))
else:
continue
conn.commit()
conn.close()